In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [12]:
import nltk
from tethne.readers import zotero
import matplotlib.pyplot as plt
from helpers import normalize_token, filter_token
import pandas as pd

Feature selection: keywords

A major problem area in text mining is determining the thematic or topical content of texts. One of the most basic tasks in this area is to identify the terms in a text -- "keywords" -- that most accurately represent its distinctive thematic characteristics.

In this notebook, we will use a keyness statistic -- scikit-learn's chi-squared test, a close relative of Dunning's log-likelihood -- to identify keywords for individual documents in a collection of texts. It is fairly typical for methods developed for statistical analysis to also be used for information extraction and classification.
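
Both statistics start from the same 2x2 contingency table. For a given term and a focal document, let $a$ be the number of times the term occurs in the focal document, $b$ the number of times it occurs in the rest of the corpus, and $c$ and $d$ the counts of all other tokens in the focal document and in the rest of the corpus, respectively. Writing $O_{ij}$ for these observed counts and $E_{ij}$ for the counts expected if the term were distributed independently of the document (row total times column total, divided by $N = a + b + c + d$), the two statistics are

$$G^2 = 2 \sum_{i,j} O_{ij} \ln\frac{O_{ij}}{E_{ij}}, \qquad \chi^2 = \sum_{i,j} \frac{(O_{ij} - E_{ij})^2}{E_{ij}}.$$

Large values of either statistic indicate that the term's frequency in the focal document departs sharply from its frequency in the rest of the corpus.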

We'll use the Embryo Project corpus from earlier notebooks. Recall that the plain text documents are stored separately from their metadata -- this is the format that you would expect from a Zotero RDF export.


In [4]:
documents = nltk.corpus.PlaintextCorpusReader('../data/EmbryoProjectTexts/files', 'https.+')
metadata = zotero.read('../data/EmbryoProjectTexts', index_by='link', follow_links=False)
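
A quick sanity check (just a sketch; the exact fileids and counts depend on your copy of the corpus): confirm that the corpus reader found the plain text files, and see what the fileids look like.

In [ ]:
print(len(documents.fileids()))
print(documents.fileids()[0])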

Our first step is to count up all of the words in each of the documents. This conditional frequency distribution should look familiar by now.


In [7]:
wordcounts_per_document = nltk.ConditionalFreqDist([
    (fileid, normalize_token(token))
     for fileid in documents.fileids()
     for token in documents.words(fileids=[fileid])
     if filter_token(token)
])
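
To get a feel for the distribution, we can peek at the most frequent normalized tokens in a single document. This is only an illustrative check; which tokens appear depends on which document happens to come first in the corpus.

In [ ]:
example_fileid = documents.fileids()[0]
wordcounts_per_document[example_fileid].most_common(10)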

In [5]:
from scipy import sparse

In [111]:
# We pick a single "focal" document that we want to characterize.
focal_fileid = documents.fileids()[3]

# Since this procedure will involve numerical matrices, we
#  need to map documents and words onto row and column indices.
#  These "dictionaries" will help us to keep track of those
#  mappings.
document_index = {}    # Maps int -> fileid (str).
vocabulary = {}        # Maps int -> word (str).
lookup = {}            # Reverse map for vocabulary (word (str) -> int).

# Containers for sparse data.
I = []          # Row (document) indices.
J = []          # Column (word) indices.
data = []       # Word counts.
labels = []     # Document labels: either the focal fileid, or "Other".

# Here we transform the ConditionalFreqDist into three parallel vectors (I, J, data)
#  that sparsely describe the document-word count matrix.
for i, (fileid, counts) in enumerate(wordcounts_per_document.iteritems()):
    document_index[i] = fileid
    for token, count in counts.iteritems():
        # Removing low-frequency terms is optional, but speeds things up
        #  quite a bit for this demonstration.
        if count < 3:
            continue
        
        # get() lets us reuse the column index already assigned to this token,
        #  or assign the next free index if we have not seen it before.
        j = lookup.get(token, len(vocabulary))
        vocabulary[j] = token
        lookup[token] = j
        
        I.append(i)
        J.append(j)
        data.append(count)
    labels.append(fileid if fileid == focal_fileid else 'Other')
    print '\r', i,


627

In [112]:
sparse_matrix = sparse.coo_matrix((data, (I, J)))

In [113]:
sparse_matrix.shape


Out[113]:
(628, 7033)
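
That is one row per document and one column per vocabulary term. As a small sketch (the names csr and focal_row are just for illustration), converting to CSR format lets us slice out the sparse word-count vector for the focal document.

In [ ]:
csr = sparse_matrix.tocsr()    # CSR format supports efficient row slicing.

# Find the row index assigned to the focal document.
focal_row = [i for i, f in document_index.items() if f == focal_fileid][0]
csr[focal_row]    # Sparse word-count vector for that document.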

In [ ]:
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import CountVectorizer

In [114]:
# chi2 returns a keyness score for every vocabulary term, plus p-values (discarded here).
keyness, _ = chi2(sparse_matrix, labels)

In [115]:
# Column indices sorted from most to least distinctive.
ranking = np.argsort(keyness)[::-1]

In [116]:
# Recover the vocabulary as a sequence ordered by column index.
_, words = zip(*sorted(vocabulary.items(), key=lambda i: i[0]))

In [117]:
words = np.array(words)

In [118]:
# Re-order the vocabulary by descending keyness.
keywords = words[ranking]

In [119]:
# The twenty most distinctive terms for the focal document, with their scores.
zip(keywords[:20], keyness[ranking][:20])


Out[119]:
[(u'occurred', 937.50239234449759),
 (u'fertilized', 796.9760972679635),
 (u'laparoscopy', 551.63727500569598),
 (u'successfully', 521.70309745656004),
 (u'icsi', 508.64564303320293),
 (u'usually', 371.41531100478466),
 (u'ivf', 363.83650540492647),
 (u'louise', 327.0183882165307),
 (u'brown', 316.68266707825279),
 (u'vitro', 315.16533758639025),
 (u'egg', 281.59206727562702),
 (u'successful', 265.95285262598281),
 (u'implant', 263.59603554340396),
 (u'attempt', 243.6516746411483),
 (u'fertilization', 229.0308302413566),
 (u'mature', 210.84376950280841),
 (u'uterus', 197.3138013152433),
 (u'steptoe', 192.15841200051727),
 (u'rabbit', 157.06855439642325),
 (u'wall', 155.78951925267714)]
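
For comparison with the chi-squared scores above, here is a minimal sketch of Dunning's log-likelihood (G^2) computed directly from the word counts for a single term, comparing the focal document against the rest of the corpus. The helper dunning_g2 and the choice of the term 'egg' are purely illustrative; terms that score highly under G^2 should largely coincide with those scoring highly under chi-squared, since both measure departure from the same expected counts.

In [ ]:
import numpy as np

def dunning_g2(a, b, c, d):
    # 2x2 contingency table: term vs. all other tokens,
    #  focal document vs. rest of the corpus.
    table = np.array([[a, b], [c, d]], dtype=float)
    N = table.sum()
    expected = np.outer(table.sum(axis=1), table.sum(axis=0)) / N
    # Cells with an observed count of zero contribute nothing.
    with np.errstate(divide='ignore', invalid='ignore'):
        contributions = np.where(table > 0, table * np.log(table / expected), 0.)
    return 2. * contributions.sum()

# Aggregate counts over every document except the focal one.
rest_counts = nltk.FreqDist()
for fileid in documents.fileids():
    if fileid != focal_fileid:
        rest_counts.update(wordcounts_per_document[fileid])

focal_counts = wordcounts_per_document[focal_fileid]

term = 'egg'    # Illustrative term from the keyword list above.
a, b = focal_counts[term], rest_counts[term]
dunning_g2(a, b, focal_counts.N() - a, rest_counts.N() - b)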

In [124]:
def extract_keywords(fileid, n=20):
    """Return the top n keywords for fileid, ranked by chi-squared keyness
    against the rest of the corpus."""
    print '\r', fileid,    # Progress indicator.
    document_index = {}    # Maps int -> fileid (str).
    vocabulary = {}    # Maps int -> word (str).
    lookup = {}    # Reverse map for vocabulary (word (str) -> int).

    I = []
    J = []
    data = []
    labels = []

    for i, (key, counts) in enumerate(wordcounts_per_document.iteritems()):
        document_index[i] = key
        for token, count in counts.iteritems():
            if count < 3:
                continue
            j = lookup.get(token, len(vocabulary))
            vocabulary[j] = token
            lookup[token] = j

            I.append(i)
            J.append(j)
            data.append(count)
        labels.append(key if key == fileid else 'Other')
        
    sparse_matrix = sparse.coo_matrix((data, (I, J)))
    
    keyness, _ = chi2(sparse_matrix, labels)
    ranking = np.argsort(keyness)[::-1]
    _, words = zip(*sorted(vocabulary.items(), key=lambda i: i[0]))
    words = np.array(words)
    keywords = words[ranking]
    return keywords[:n]

In [125]:
keywords = [extract_keywords(fileid) for fileid in documents.fileids()]


https--____hpsrepository.asu.edu__handle__10776__8688.txt
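
Since pandas is already imported, a small sketch like the following (the DataFrame layout and column names are just one possibility) collects the results into a table keyed by fileid, which makes it easy to join the keywords against the Zotero metadata later.

In [ ]:
keyword_table = pd.DataFrame({
    'fileid': documents.fileids(),
    'keywords': [', '.join(kws) for kws in keywords],
})
keyword_table.head()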

In [ ]: